Link to Git repo: https://github.com/nktang05/kibera.git

Link to Analysis: https://nktang05.github.io/kibera/KiberaAnalysis.html

# Prints a fixed greeting; presumably a leftover smoke-test chunk (TODO confirm
# whether it is still needed).
print("hello world")
## [1] "hello world"

Data Cleaning

# Read the raw survey export.
# NOTE(review): setwd() and the absolute paths tie this script to one machine.
# The working directory is still set here because the export step at the end
# of the script writes its output files with relative paths.
setwd("~/Desktop/GRIT/Kibera")
data <- fread("~/Desktop/GRIT/Kibera/kibera_values_data.csv", header = TRUE)

# Columns the analysis does not need. Listing them once keeps the set easy to
# audit and avoids one assignment statement per column.
drop_cols <- c(
  "StartDate", "EndDate", "Status", "IPAddress", "Progress",
  "Duration (in seconds)", "Finished", "RecordedDate", "ResponseId",
  "RecipientLastName", "RecipientFirstName", "RecipientEmail",
  "ExternalReference", "LocationLatitude", "LocationLongitude",
  "DistributionChannel", "UserLanguage",
  "2.11_7_TEXT", "2.13_7_TEXT", "2.20_5_TEXT", "3.2_8_TEXT",
  "3.16_6_TEXT", "4.21_5_TEXT", "5.1_5_TEXT", "5.12_6_TEXT"
)

# Select by position so that a column missing from the export is simply
# ignored and the remaining columns keep their original order.
keep_idx <- which(!(names(data) %in% drop_cols))
data <- data[, keep_idx, with = FALSE]


# Capture the first row of the raw table as a plain character vector; it is
# used further down as the per-column variable labels.
variable_labels <- as.character(unlist(data[1L, ]))

# Remove the first two rows, which are not data rows.
data <- data[-(1:2), ]

# Prefix every column name that starts with a digit with "x"; all other
# names are left untouched.
col_names <- names(data)
starts_with_digit <- grepl("^[0-9]", col_names)
col_names[starts_with_digit] <- paste0("x", col_names[starts_with_digit])
names(data) <- col_names
# Columns to store as numbers.
numericVars <- c("x1.1", "x1.2", "x1.3", "x2.1", "x3.1_1_TEXT", "x3.9")

# Convert via character first; entries that cannot be parsed as numbers
# become NA, which is what produces the coercion warnings echoed below.
for (num_col in numericVars) {
  as_text <- as.character(data[[num_col]])
  data[[num_col]] <- as.numeric(as_text)
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion

# Parse x1.4 as a date written day/month/four-digit-year.
data$x1.4 <- as.Date(data$x1.4, format = "%d/%m/%Y")

# Columns to store as character strings.
charVars <- c(
  "x2.5_5_TEXT", "x2.7_6_TEXT", "x2.10_8_TEXT", "x2.12_10_TEXT",
  "x2.14_5_TEXT", "x2.16_7_TEXT", "x2.21_9_TEXT", "x3.3_7_TEXT",
  "x3.13_8_TEXT", "x3.14_7_TEXT", "x3.17_7_TEXT", "x4.10_1_TEXT",
  "x7.9_6_TEXT"
)

for (chr_col in charVars) {
  data[[chr_col]] <- as.character(data[[chr_col]])
}


#for (col in factorVars) {
  #data[[col]] <- as.factor(data[[col]])
#}


#names(data)
#summary(data)
# Keep only rows whose age (x2.1) is present and between 13 and 20 inclusive,
# and whose gender (x2.2) is neither NA nor an empty string.
# The masks spell out the NA handling so no row is kept or dropped by accident.
age_value <- data[["x2.1"]]
gender_value <- data[["x2.2"]]

age_in_range <- !is.na(age_value) & age_value >= 13 & age_value <= 20
gender_given <- !is.na(gender_value) & gender_value != ""

data <- data[age_in_range & gender_given, ]

Check for inconsistent conditional responses

# Blank out answers that contradict an earlier answer.
# Throughout, x2.2 == 2 is treated as "male" (per the original comments).
# which() is used so that rows where a comparison evaluates to NA are skipped
# explicitly instead of relying on NA subscripts in the assignment.

# CHECK FOR MALE CONDITIONALS
is_male <- data[["x2.2"]] == 2

# Pregnancy (x4.8): set to NA when a male respondent answered 1 (yes).
data[["x4.8"]][which(is_male & data[["x4.8"]] == 1)] <- NA

# Menstruation (x3.1): set to NA when a male respondent answered 1 or 2.
data[["x3.1"]][which(is_male & data[["x3.1"]] %in% c(1, 2))] <- NA

# Menstruation follow-ups: set any non-empty answer from a male respondent
# to NA. Empty strings are deliberately left as they are.
male_blank_cols <- c(
  "x3.1_1_TEXT", "x3.2", "x3.3", "x3.4", "x3.5", "x3.6", "x3.7"
)
for (col in male_blank_cols) {
  data[[col]][which(is_male & data[[col]] != "")] <- NA
}

# CHECK FOR SEX ACTIVITY CONDITIONALS
# x3.9 is a follow-up to x3.8, so it is blanked when x3.8 is not 1.
# The original condition tested x2.2 (gender) here, which set x3.9 to NA for
# every non-female respondent; the consistency query that follows checks
# x3.8, so x3.8 is the column used.
no_sex_activity <- data[["x3.8"]] != 1
data[["x3.9"]][which(no_sex_activity & data[["x3.9"]] != "")] <- NA
# List every x1.2 value (village number, per the original note) that appears
# on more than one row, together with how many rows carry it. NA is grouped
# as its own value in the output below.
sqldf("SELECT [x1.2], COUNT(*) as count 
       FROM data 
       GROUP BY [x1.2] 
       HAVING COUNT(*) > 1")
##    x1.2 count
## 1    NA    18
## 2     1     2
## 3    17     2
## 4    30     3
## 5   202     2
## 6   205     2
## 7   207     2
## 8   208     2
## 9   209     2
## 10  210     2
## 11  211     2
## 12  265     2
## 13  270     2
## 14  271     2
## 15  436     2
## 16  444     2
## 17  451     2
## 18  456     2
## 19  543     3
## 20  607     2
# Consistency check: rows where a male respondent (x2.2 = 2) is recorded as
# pregnant (x4.8 = 1). Zero rows means no such record remains.
sqldf("SELECT [x2.2], [x4.8] , [x1.2]
      FROM data 
      WHERE [x2.2] = 2 AND [x4.8] = 1")
## [1] x2.2 x4.8 x1.2
## <0 rows> (or 0-length row.names)
# Consistency check: male respondents with a menstruation answer (x3.1) of
# 1 or 2. Zero rows expected.
sqldf("SELECT [x2.2], [x3.1] 
      FROM data 
      WHERE [x2.2] = 2 AND ([x3.1] = 1 OR [x3.1] = 2)")
## [1] x2.2 x3.1
## <0 rows> (or 0-length row.names)
# Consistency check: male respondents with a non-empty menstruation age
# (x3.1_1_TEXT). Zero rows expected.
sqldf("SELECT [x2.2], [x3.1_1_TEXT]
       FROM data 
       WHERE [x2.2] = 2 AND [x3.1_1_TEXT] != ''")
## [1] x2.2        x3.1_1_TEXT
## <0 rows> (or 0-length row.names)
# Consistency check: male respondents with a non-empty x3.2. Zero rows
# expected.
sqldf("SELECT [x2.2], [x3.2]
       FROM data 
       WHERE [x2.2] = 2 AND [x3.2] != ''")
## [1] x2.2 x3.2
## <0 rows> (or 0-length row.names)
# The same check applies to x3.3, x3.4, x3.5, x3.6 and x3.7; only the x3.7
# query is run here.
sqldf("SELECT [x2.2], [x3.7]
       FROM data 
       WHERE [x2.2] = 2 AND [x3.7] != ''")
## [1] x2.2 x3.7
## <0 rows> (or 0-length row.names)
# Consistency check for the sexual-activity follow-up: rows where x3.8 is
# not 1 but x3.9 is non-empty. Zero rows expected.
sqldf("SELECT [x3.8], [x3.9]
       FROM data 
       WHERE [x3.8] != 1 AND [x3.9] != ''")
## [1] x3.8 x3.9
## <0 rows> (or 0-length row.names)

Make Codebook

# Build the codebook object from the cleaned data.
# NOTE(review): this runs before the variable and value labels below are
# attached, so the codebook may not reflect them - confirm the intended order.
codebook_output <- codebook(data)

# Attach the saved variable labels to the columns by position.
for (i in seq_along(data)) {
  var_label(data[[i]]) <- variable_labels[i]
}
# test for success
# var_label(data)

# THIS IS WHERE MORE LABELS NEED TO BE HARDCODED IN
# labelled() builds a new vector and as.integer() strips attributes, so the
# variable label assigned in the loop above would be lost for x2.2. It is
# read first and passed back in through `label`.
gender_label <- var_label(data$x2.2)
data$x2.2 <- labelled(
  x = as.integer(data$x2.2),
  labels = c("Female" = 1, "Male" = 2),
  label = gender_label
)

# Make factor variables after the codebook: every column that is not numeric,
# free text, or the x1.4 date column is converted, using value labels as the
# factor levels where they exist.
allVars <- names(data)
excludeVars <- c(numericVars, charVars, "x1.4")
factorVars <- setdiff(allVars, excludeVars)

for (col in factorVars) {
  data[[col]] <- to_factor(data[[col]], levels = "labels")
}

#check
#class(data$x2.2)
#levels(data$x2.2) 
#summary(data)

#summary(data$x2.2)

Export clean data and codebook

# Write the cleaned data as a CSV file in the current working directory.
fwrite(x = data, file = "kibera_values_cleaned.csv")

# Serialize to codebook.rds in the current working directory.
# NOTE(review): the object written here is `data`, not `codebook_output`, even
# though the file is named as a codebook. Confirm which one downstream code
# expects before changing it.
saveRDS(object = data, file = "codebook.rds")